package com.charm.tools;

import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;

import org.bson.Document;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.BufferedReader;
import java.io.BufferedWriter;

import com.charm.utils.CmAnsj;
import com.charm.utils.CmConf;
import com.charm.utils.CmMongo;
import com.charm.utils.CmAnsj.CmWord;

import org.apache.commons.cli.Options;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;

/**
 * Reads samples from the sampling database, segments them into words,
 * and saves the segmented text to a file.
 * @author gonglibin
 * 2017.11.28
 */

public class CmSample {
	/**
	 * Command-line entry point.
	 *
	 * Recognised flags (the first one present wins, in this order):
	 *   -t / --tag    print the number of documents per category
	 *   -f / --file   export samples for a fixed, built-in category list
	 *   -c / --cramp  export samples for the categories listed in a file
	 *
	 * Any exception is caught here and its stack trace printed.
	 *
	 * @param args raw command-line arguments
	 */
	public static void main(String[] args) {
		try {
			// Parse the command line first so that a malformed invocation
			// fails before a database connection is opened.
			Options opt = new Options();
			opt.addOption("t", "tag", false, "tag statistic");
			opt.addOption("f", "file", false, "ouput samples");
			opt.addOption("c", "cramp", false, "cramp samples");
			CommandLine cmd = new BasicParser().parse(opt, args);
			
			CmMongo mgo = new CmMongo();
			CmConf cfg = new CmConf(CmConf.CM_CMGO);
			// NOTE(review): CM_CHST / CM_CPOT are presumably host and port -- confirm in CmConf.
			mgo.CmMongoOpen(cfg.CmConfGetString(CmConf.CM_CHST), cfg.CmConfGetInt(CmConf.CM_CPOT));
			try {
				mgo.CmMongoGetDatabase(cfg.CmConfGetString(CmConf.CM_CMDB));
				mgo.CmMongoGetCollection(cfg.CmConfGetString(CmConf.CM_CMTB));
				
				if (cmd.hasOption('t')) {
					CmSampleTagStatistic(mgo);
				}
				else if (cmd.hasOption('f')) {
					CmSamplePointOutSample(mgo);
				}
				else if (cmd.hasOption('c')) {
					CmSampleCrampOutSample(mgo);
				}
			} finally {
				// Close the handle even when the selected mode throws.
				mgo.CmMongoClose();
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Counts the documents of each "category" value and prints one
	 * "category: count" line per distinct value to standard output.
	 *
	 * @param m opened database handle whose collection is iterated
	 * @throws Exception propagated from the database helper
	 */
	private static void CmSampleTagStatistic(CmMongo m) throws Exception {
		Map<String, Integer> map = new HashMap<String, Integer>();
		
		for (Document d : m.CmMongoSelect()) {
			String key = d.getString("category");
			Integer val = map.get(key);
			map.put(key, null == val ? 1 : val + 1);
		}
		
		for (Map.Entry<String, Integer> v : map.entrySet()) {
			System.out.println(v.getKey() + ": " + v.getValue());
		}
	}
	
	/**
	 * Exports samples for the fixed category list below, writing 2000
	 * lines per category via {@link #CmSampleOutputSample}.
	 *
	 * @param m opened database handle
	 * @throws Exception propagated from the export
	 */
	private static void CmSamplePointOutSample(CmMongo m) throws Exception {
		final int		CM_LIMT = 2000;
		final String[]	CM_TAGS = {"穿搭", "创业", "电视剧", "读书", "攻略", 	"购物", "管理", "纪实", "留学", "美容", "美妆", "明星", "母婴", "评论", "奢侈品", "生活", "探索", "校园", "艺术", "音乐", "营销", "运动", "整形", "智能", "综艺"};
		
		List<String> cat = new ArrayList<String>();
		for (String v : CM_TAGS) {
			cat.add(v);
		}
		
		CmSampleOutputSample(m, cat, CM_LIMT);
	}
	
	/**
	 * Exports samples for every category named in
	 * ./config/CmBasCategory.dat (one category per line, surrounding
	 * whitespace trimmed), writing 10000 lines per category via
	 * {@link #CmSampleOutputSample}. Categories with fewer documents are
	 * padded by repeating their samples.
	 *
	 * @param m opened database handle
	 * @throws Exception if the category file cannot be read, or propagated from the export
	 */
	private static void CmSampleCrampOutSample(CmMongo m) throws Exception {
		final int		CM_LIMT = 10000;
		final String	CM_CATS = "./config/CmBasCategory.dat";
		
		List<String> cat = new ArrayList<String>();
		BufferedReader br = new BufferedReader(new FileReader(CM_CATS));
		try {
			String buf;
			while (null != (buf = br.readLine())) {
				String name = buf.trim();
				// A blank line is not a category name; skip it rather than query for "".
				if (name.length() > 0) {
					cat.add(name);
				}
			}
		} finally {
			// Release the file handle even if reading fails.
			br.close();
		}
		
		CmSampleOutputSample(m, cat, CM_LIMT);
	}
	
	/**
	 * For each category, fetches up to n documents, segments
	 * "title。content" into space-separated words, and writes exactly n
	 * lines of the form "words \t__label__category\n" to ./dat/sample.dat.
	 * When a category has fewer than n documents, its lines are repeated
	 * cyclically until n lines have been written. A category with no
	 * documents is skipped with a warning on standard error.
	 *
	 * @param m opened database handle
	 * @param l category names to export
	 * @param n number of lines to write per category
	 * @throws Exception on I/O failure or propagated from the database/segmenter helpers
	 */
	private static void CmSampleOutputSample(CmMongo m, List<String> l, int n) throws Exception {
		final String	CM_FILE = "./dat/sample.dat";
		BufferedWriter bw = new BufferedWriter(new FileWriter(CM_FILE));
		try {
			CmAnsj asj = new CmAnsj(CmAnsj.CM_CWDS, CmAnsj.CM_CNTS, CmAnsj.CM_CDIC);
			
			for (String c : l) {
				List<String> doc = new ArrayList<String>();
				for (Document d : m.CmMongoSelect("category", c).limit(n)) {
					// StringBuilder avoids re-copying the text for every token.
					StringBuilder txt = new StringBuilder();
					for (CmWord w : asj.CmAnsjParse(d.getString("title") + "。" + d.getString("content"))) {
						txt.append(w.val).append(' ');
					}
					txt.append("\t__label__").append(d.getString("category")).append('\n');
					doc.add(txt.toString());
				}
				
				// Nothing to repeat: padding an empty list up to n lines would never terminate.
				if (doc.isEmpty()) {
					System.err.println("no samples for category: " + c);
					continue;
				}
				
				// Cycle through the collected lines until exactly n have been written.
				int size = doc.size();
				for (int i = 0; i < n; i ++) {
					bw.write(doc.get(i % size));
				}
			}
		} finally {
			// Flush and close the output even if a category fails part-way.
			bw.close();
		}
	}
}